{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "# LSTM　Matchining(LSTM匹配)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import modules\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM\n",
    "import nltk\n",
    "import torch\n",
    "import string"
   ]
  },
  {
   "source": [
    "## 读取数据"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "1000"
      ]
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "datasets = pd.read_csv('datasets/DBLP-Scholar/candidates_sample.csv')\n",
    "len(datasets)"
   ]
  },
  {
   "source": [
    "### 查看数据格式"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "datasets.head(10)"
   ],
   "cell_type": "code",
   "metadata": {},
   "execution_count": 3,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "   Unnamed: 0  _id                           ltable_id  \\\n",
       "0           5    5                    conf/sigmod/2000   \n",
       "1           6    6                    conf/sigmod/2000   \n",
       "2          16   16                    conf/sigmod/2000   \n",
       "3          35   35                    conf/sigmod/2002   \n",
       "4          40   40                    conf/sigmod/2002   \n",
       "5          42   42                    conf/sigmod/2002   \n",
       "6          51   51                    conf/sigmod/2002   \n",
       "7          55   55                    conf/sigmod/2002   \n",
       "8          81   81                    conf/sigmod/2003   \n",
       "9          93   93  conf/sigmod/AbadiCCCCEGHMRSSTXYZ03   \n",
       "\n",
       "                                           rtable_id  \\\n",
       "0                                       CpwYam_LYyEJ   \n",
       "1                                       F9KzvnDpCPUJ   \n",
       "2  url:http://portal.acm.org/citation.cfm%3Fid%3D...   \n",
       "3                                       QnMrKru1S1MJ   \n",
       "4  url:http://portal.acm.org/citation.cfm%3Fcoll%...   \n",
       "5  url:http://portal.acm.org/citation.cfm%3Fid%3D...   \n",
       "6  url:http://portal.acm.org/citation.cfm%3Fid%3D...   \n",
       "7                                       vfTH6FzOpDAJ   \n",
       "8  url:http://portal.acm.org/citation.cfm%3Fid%3D...   \n",
       "9                                       eBnT7lhV2LwJ   \n",
       "\n",
       "                                        ltable_title  \\\n",
       "0  proceedings of the 2000 acm sigmod internation...   \n",
       "1  proceedings of the 2000 acm sigmod internation...   \n",
       "2  proceedings of the 2000 acm sigmod internation...   \n",
       "3  proceedings of the 2002 acm sigmod internation...   \n",
       "4  proceedings of the 2002 acm sigmod internation...   \n",
       "5  proceedings of the 2002 acm sigmod internation...   \n",
       "6  proceedings of the 2002 acm sigmod internation...   \n",
       "7  proceedings of the 2002 acm sigmod internation...   \n",
       "8  proceedings of the 2003 acm sigmod internation...   \n",
       "9            aurora: a data stream management system   \n",
       "\n",
       "                                      ltable_authors  ltable_year  \\\n",
       "0                                                NaN         2000   \n",
       "1                                                NaN         2000   \n",
       "2                                                NaN         2000   \n",
       "3                                                NaN         2002   \n",
       "4                                                NaN         2002   \n",
       "5                                                NaN         2002   \n",
       "6                                                NaN         2002   \n",
       "7                                                NaN         2002   \n",
       "8                                                NaN         2003   \n",
       "9  d abadi, d carney, u ï¿½etintemel, m cherniack...         2003   \n",
       "\n",
       "                                        rtable_title  \\\n",
       "0  on predicting data cache behavior for real {ti...   \n",
       "1  similarity-based queries for time series data....   \n",
       "2  source international conference on management ...   \n",
       "3  blocation-based spatial queries,^ in proceedin...   \n",
       "4  full text pdf format pdf (424 kb)source intern...   \n",
       "5  source international conference on management ...   \n",
       "6  full text pdf format pdf (1.24 mb)source inter...   \n",
       "7  data modeling of time-based media. in: proceed...   \n",
       "8  source international conference on management ...   \n",
       "9  aurora: a data stream management system (demo ...   \n",
       "\n",
       "                                      rtable_authors  rtable_year  label  \n",
       "0                             c ferdinand, r wilhelm            0      0  \n",
       "1                              d raflei, a mendelzon         1997      0  \n",
       "2                           rt snodgrass, m winslett         1994      0  \n",
       "3          j zhang, m zhu, d papadias, y tao, dl lee         2003      0  \n",
       "4  c olston, a woodruff, a aiken, m chu, v ercego...         1998      0  \n",
       "5                                          c zaniolo         1986      0  \n",
       "6                           h garcia-molina, k salem         1987      0  \n",
       "7              s gibbs, c breiteneder, d tsichritzis         1994      0  \n",
       "8                                 h boral, pa larson         1988      0  \n",
       "9  d abadi, d carney, u cetintemel, m cherniack, c              0      1  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>_id</th>\n      <th>ltable_id</th>\n      <th>rtable_id</th>\n      <th>ltable_title</th>\n      <th>ltable_authors</th>\n      <th>ltable_year</th>\n      <th>rtable_title</th>\n      <th>rtable_authors</th>\n      <th>rtable_year</th>\n      <th>label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>5</td>\n      <td>5</td>\n      <td>conf/sigmod/2000</td>\n      <td>CpwYam_LYyEJ</td>\n      <td>proceedings of the 2000 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>on predicting data cache behavior for real {ti...</td>\n      <td>c ferdinand, r wilhelm</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>6</td>\n      <td>6</td>\n      <td>conf/sigmod/2000</td>\n      <td>F9KzvnDpCPUJ</td>\n      <td>proceedings of the 2000 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>similarity-based queries for time series data....</td>\n      <td>d raflei, a mendelzon</td>\n      <td>1997</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>16</td>\n      <td>16</td>\n      <td>conf/sigmod/2000</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D...</td>\n      <td>proceedings of the 2000 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>source international conference on management ...</td>\n      <td>rt snodgrass, m winslett</td>\n      <td>1994</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>35</td>\n      <td>35</td>\n      <td>conf/sigmod/2002</td>\n      <td>QnMrKru1S1MJ</td>\n      <td>proceedings of the 2002 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>blocation-based spatial queries,^ in proceedin...</td>\n      <td>j zhang, m zhu, d papadias, y tao, dl lee</td>\n      <td>2003</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>40</td>\n      <td>40</td>\n      <td>conf/sigmod/2002</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fcoll%...</td>\n      <td>proceedings of the 2002 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>full text pdf format pdf (424 kb)source intern...</td>\n      <td>c olston, a woodruff, a aiken, m chu, v ercego...</td>\n      <td>1998</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>42</td>\n      <td>42</td>\n      <td>conf/sigmod/2002</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D...</td>\n      <td>proceedings of the 2002 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>source international conference on management ...</td>\n      <td>c zaniolo</td>\n      <td>1986</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>51</td>\n      <td>51</td>\n      <td>conf/sigmod/2002</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D...</td>\n      <td>proceedings of the 2002 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>full text pdf format pdf (1.24 mb)source inter...</td>\n      <td>h garcia-molina, k salem</td>\n      <td>1987</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>55</td>\n      <td>55</td>\n      <td>conf/sigmod/2002</td>\n      <td>vfTH6FzOpDAJ</td>\n      <td>proceedings of the 2002 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>data modeling of time-based media. in: proceed...</td>\n      <td>s gibbs, c breiteneder, d tsichritzis</td>\n      <td>1994</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>81</td>\n      <td>81</td>\n      <td>conf/sigmod/2003</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D...</td>\n      <td>proceedings of the 2003 acm sigmod internation...</td>\n      <td>NaN</td>\n      <td>2003</td>\n      <td>source international conference on management ...</td>\n      <td>h boral, pa larson</td>\n      <td>1988</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>93</td>\n      <td>93</td>\n      <td>conf/sigmod/AbadiCCCCEGHMRSSTXYZ03</td>\n      <td>eBnT7lhV2LwJ</td>\n      <td>aurora: a data stream management system</td>\n      <td>d abadi, d carney, u ï¿½etintemel, m cherniack...</td>\n      <td>2003</td>\n      <td>aurora: a data stream management system (demo ...</td>\n      <td>d abadi, d carney, u cetintemel, m cherniack, c</td>\n      <td>0</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 3
    }
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_arr = np.array(datasets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(1000, 11)"
      ]
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "source": [
    "data_arr.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data_arr[:,4:10]\n",
    "y = data_arr[:,10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array(['a multimedia presentation algebra',\n",
       "       's adali, m sapino, v subrahmanian', 1999,\n",
       "       'a multimedia presentation algebra',\n",
       "       's adalä±, ml sapino, vs subrahmanian', 1999], dtype=object)"
      ]
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "X[19]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1],\n",
       "      dtype=object)"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "y[0:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = nltk.word_tokenize(str(X[0][0]))\n",
    "words = np.array([np.array(nltk.word_tokenize(str(x))) for x in X])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array(['[', \"'proceedings\", 'of', 'the', '2000', 'acm', 'sigmod',\n",
       "       'international', 'conference', 'on', 'management', 'of', 'data',\n",
       "       ',', 'may', '16-18', ',', '2000', ',', 'dallas', ',', 'texas', ',',\n",
       "       \"usa'\", 'nan', '2000', \"'on\", 'predicting', 'data', 'cache',\n",
       "       'behavior', 'for', 'real', '{', 'time', 'systems', '.', 'in',\n",
       "       'proceedings', 'of', 'the', 'acm', 'sigplan', 'workshop', \"'\", \"'\",\n",
       "       'c', 'ferdinand', ',', 'r', 'wilhelm', \"'\", '0', ']'], dtype='<U13')"
      ]
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "source": [
    "words[0]"
   ]
  },
  {
   "source": [
    "## 测试bert"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 加载预处理的Bert模型\n",
    "\n",
    "以下是如何获取支持pytorch支持的Bert模型\n",
    "\n",
    "1. [下载Bert模型](https://github.com/google-research/bert#pre-trained-models)\n",
    "\n",
    "2. [转换Bert模型支持pytorch](https://blog.csdn.net/weixin_41287060/article/details/105080705)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用下载的本地模型,加快速度\n",
    "tokenizer = BertTokenizer.from_pretrained('cased_L-12_H-768_A-12')\n",
    "model = BertModel.from_pretrained('cased_L-12_H-768_A-12')"
   ]
  },
  {
   "source": [
    "### 获取隐藏层向量"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = str(words[0])\n",
    "tokenized_text = tokenizer.tokenize(text) #token初始化\n",
    "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) #获取词汇表索引\n",
    "tokens_tensor = torch.tensor([indexed_tokens]) #将输入转化为torch的tensor\n",
    "with torch.no_grad(): #禁用梯度计算 因为只是前向传播获取隐藏层状态，所以不需要计算梯度\n",
    "    last_hidden_states = model(tokens_tensor)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "torch.Size([1, 191, 768])"
      ]
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "source": [
    "last_hidden_states[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "'conf/sigmod/2002'"
      ]
     },
     "metadata": {},
     "execution_count": 18
    }
   ],
   "source": [
    "data_arr[4][2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "'proceedings of the 2002 acm sigmod international conference on management of data, madison, wisconsin, june 3-6, 2002'"
      ]
     },
     "metadata": {},
     "execution_count": 20
    }
   ],
   "source": [
    "data_arr[4][4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['Life',\n",
       " 'is',\n",
       " 'like',\n",
       " 'a',\n",
       " 'box',\n",
       " 'of',\n",
       " 'chocolate',\n",
       " ',',\n",
       " 'you',\n",
       " 'never',\n",
       " 'know',\n",
       " 'what',\n",
       " 'you',\n",
       " 'gon',\n",
       " 'na',\n",
       " 'get',\n",
       " '.']"
      ]
     },
     "metadata": {},
     "execution_count": 21
    }
   ],
   "source": [
    "nltk.word_tokenize('Life is like a box of chocolate, you never know what you gonna get.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}